library(tidyverse)

# Corrects the Full_Pheno column of every phenotype CSV in the working
# directory. A participant is marked "Yes" when their EID appears in the
# Diagnosis_ClinicianConsensus dataset and "No" otherwise. A before/after
# summary of the Yes/No counts is written to review.csv.
#
# NOTE: the phenotype CSVs are overwritten in place with the corrected column.

# ---- Locate input files ----

diagnosis_name <- "Diagnosis_ClinicianConsensus"
review_file <- "review.csv"

# Anchor the pattern so only names that really end in ".csv" are listed
# (an unanchored 'csv' also matches e.g. "notes_csv.txt").
filenames <- list.files(pattern = "\\.csv$")

# review.csv is this script's own output; skip it so a re-run does not treat
# it as a phenotype dataset.
filenames <- setdiff(filenames, review_file)

# Dataset names are the file names without the extension. The dot is escaped
# and the match anchored so nothing but the trailing ".csv" is stripped.
names <- str_replace_all(filenames, "\\.csv$", "")

if (!diagnosis_name %in% names) {
  stop(
    "Expected ", diagnosis_name, ".csv in the working directory.",
    call. = FALSE
  )
}

# Every dataset other than the diagnosis one is a phenotype dataset. Select
# by name rather than by position so the result does not depend on sort order.
is_pheno <- names != diagnosis_name
names2 <- names[is_pheno]
filenames2 <- filenames[is_pheno]

# ---- Read datasets ----

# Strip the ",assessment" string from the Identifiers column and store the
# result as EID so it can be matched against the phenotype datasets.
Diagnosis_ClinicianConsensus <- read_csv(paste0(diagnosis_name, ".csv")) %>%
  mutate(EID = str_replace_all(Identifiers, ",assessment", ""))

# Named list of phenotype datasets, in the same order as names2 / filenames2.
filelist <- setNames(lapply(filenames2, read_csv), names2)

# Fail early, with a clear message, if a dataset lacks a column used below.
required_cols <- c("EID", "Full_Pheno")
for (dataset_name in names2) {
  missing_cols <- setdiff(required_cols, colnames(filelist[[dataset_name]]))
  if (length(missing_cols) > 0) {
    stop(
      "Dataset '", dataset_name, "' is missing column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
}

# ---- Helpers ----

# Number of rows in `data` whose Full_Pheno equals `level`. Returns 0 when the
# level is absent, where indexing a table() by a missing name would error.
count_full_pheno <- function(data, level) {
  as.numeric(sum(data[["Full_Pheno"]] == level, na.rm = TRUE))
}

# Yes/No counts for every dataset in `datasets`, as a list of two numeric
# vectors (`yes`, `no`) in the same order as `datasets`.
tally_full_pheno <- function(datasets) {
  list(
    yes = vapply(datasets, count_full_pheno, numeric(1), level = "Yes",
                 USE.NAMES = FALSE),
    no = vapply(datasets, count_full_pheno, numeric(1), level = "No",
                USE.NAMES = FALSE)
  )
}

# ---- Counts before correction ----

before <- tally_full_pheno(filelist)

# ---- Correct Full_Pheno ----

diagnosed_ids <- Diagnosis_ClinicianConsensus$EID

# Recompute Full_Pheno from membership in the diagnosis dataset and store it
# as a factor with levels c("Yes", "No").
filelist <- lapply(filelist, function(data) {
  data[["Full_Pheno"]] <- factor(
    ifelse(data[["EID"]] %in% diagnosed_ids, "Yes", "No"),
    levels = c("Yes", "No")
  )
  data
})

# ---- Counts after correction ----

after <- tally_full_pheno(filelist)

review <- tibble(
  dataset = names2,
  yes = before$yes,
  no = before$no,
  yes_corrected = after$yes,
  no_corrected = after$no,
  # Net change in the number of "Yes" rows caused by the correction.
  n_corrected = yes_corrected - yes
)

rm(before, after, diagnosed_ids, required_cols, is_pheno)

# ---- Write outputs ----

# Review table.
write_csv(review, review_file)

# Updated phenotype datasets, written back over the files they were read from.
for (i in seq_along(filelist)) {
  write_csv(filelist[[i]], filenames2[[i]])
}

# Also expose each corrected dataset as an object named after its file, so
# follow-up code can keep referring to the datasets by name.
list2env(filelist, envir = environment())